%matplotlib inline
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
# Read the complaints table from the CSV next to the notebook.
df = pd.read_csv("data.csv")
# Preview the first rows.
df.head()
# Column names as a plain Python list.
df.columns.tolist()
def rank(df, column, countup='unique_mos_id'):
    """Count rows per distinct value of `column`, most frequent first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both `column` and `countup`.
    column : str
        Column whose distinct values define the groups.
    countup : str, default 'unique_mos_id'
        Column whose non-null entries are counted within each group.

    Returns
    -------
    pandas.DataFrame
        Two columns, `column` and 'count', sorted by 'count' descending.
    """
    # Count only the column that is needed instead of every column in the
    # frame; selecting it up front also keeps the result correct when
    # `countup` is the same column as `column`.
    counts = df.groupby(column)[countup].count().reset_index(name='count')
    return counts.sort_values('count', ascending=False)
# Keep only the officer-describing columns and drop exact duplicate rows.
# Chaining .drop_duplicates() returns a fresh frame, which avoids the
# SettingWithCopyWarning that an in-place drop on a slice of `df` triggers
# and keeps this cell safe to re-run.
officer_df = df[
    ['unique_mos_id', 'first_name', 'last_name', 'rank_now', 'mos_ethnicity', 'mos_gender']
].drop_duplicates()
rank(officer_df, 'mos_gender')
rank(officer_df, 'mos_ethnicity')
rank(officer_df, 'rank_now')
# Rows per officer id in the full table, counted via non-null first_name values.
num_complaint_df = rank(df, 'unique_mos_id', countup='first_name')
num_bins = 75
fig, ax = plt.subplots()
# Histogram of the per-officer complaint counts: the x axis is the number of
# complaints an officer has, the y axis is how many officers fall in each bin.
n, bins, patches = ax.hist(num_complaint_df['count'], num_bins)
ax.set_xlabel('Complaints per officer')
ax.set_ylabel('Number of officers')
ax.set_title(r'Histogram of Complaints per Cop')
# Tweak spacing to prevent clipping of ylabel
fig.tight_layout()
plt.show()
10% of cops are responsible for 70% of complaints.
num_bins = 75
fig, ax = plt.subplots()
# Cumulative, normalized histogram of the per-officer complaint counts: the
# x axis is the number of complaints an officer has, the y axis is the
# fraction of officers with at most that many (the last bin reaches 1).
n, bins, patches = ax.hist(
    num_complaint_df['count'], num_bins, cumulative=True, density=True, stacked=True)
ax.set_xlabel('Complaints per officer')
ax.set_ylabel('Cumulative fraction of officers')
ax.set_title(r'CDF of Complaints per Cop')
# Tweak spacing to prevent clipping of ylabel
fig.tight_layout()
plt.show()
# Complaint rows per precinct, with the precinct label cast to an integer in
# one vectorized step. Building the frame with .assign (rather than mutating
# a column through attribute access) keeps the cell idempotent on re-run.
precinct_df = rank(df, 'precinct').assign(
    precinct=lambda ranked: ranked['precinct'].astype('int64')
)
precinct_df.head(10)
The top precinct in particular (75, East NY - Brooklyn) has roughly twice as many complaints as the next-highest precinct.
import folium
nyc_coor = [40.75, -73.8759]
# Instantiate a folium map object centered on the coordinate above.
complaint_map = folium.Map(location=nyc_coor, zoom_start=10)
# Path to the GeoJSON file with the precinct boundaries.
pathgeo = './precincts.geojson'
# Build the choropleth layer and attach it to the map. The Map.choropleth
# method was deprecated and is not available in current folium releases;
# folium.Choropleth takes the same keyword arguments.
folium.Choropleth(
    geo_data=pathgeo,
    data=precinct_df,
    columns=['precinct', 'count'],
    key_on='feature.properties.Precinct',
    fill_color='BuPu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Complaints per precinct',
).add_to(complaint_map)
# Show the map.
complaint_map
# Distinct board disposition values present in the data.
set(df['board_disposition'])
# Number of rows per disposition, most common first.
board_results_df = rank(df, column='board_disposition')
board_results_df
75% of the time there's no punishment at all.
# Share of all counted dispositions represented by the two hard-coded totals.
# NOTE(review): 15448 and 9609 look like counts copied from the disposition
# table above - confirm which dispositions they are; they will go stale if
# the data changes.
(15448 + 9609) / board_results_df['count'].sum()
How long do cases stay open?
# Months between receipt and closure of each case, from the year/month pairs.
# .assign builds the new column on a fresh frame, so nothing is written into
# a slice of `df` (which would raise SettingWithCopyWarning).
duration_df = df[
    ['month_received', 'year_received', 'month_closed', 'year_closed']
].assign(
    months_open=lambda cases: 12 * (cases.year_closed - cases.year_received)
    + (cases.month_closed - cases.month_received)
)
duration_df.months_open.mean()
# Rows per FADO type, most common first.
rank(df, column='fado_type')
# Rows per allegation; show the ten most common.
allegation_df = rank(df, column='allegation')
allegation_df.iloc[:10]